{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# This notebook combines predictions of a set of models \n",
    "# using by training a regression model on the outputs of the\n",
    "# models.\n",
    "run_names = [\n",
    "    'tagging/7_11_17/dense_ensemble2/0',\n",
    "    'tagging/7_11_17/dense_ensemble2/1',\n",
    "    'tagging/7_11_17/dense_ensemble2/2',\n",
    "    'tagging/7_11_17/dense_ensemble2/3',\n",
    "    'tagging/7_11_17/dense_ensemble2/4'\n",
    "]\n",
    "\n",
    "train_probs_fn = 'train_probs.npy'\n",
    "train_preds_fn = 'train_preds.csv'\n",
    "\n",
    "val_probs_fn = 'validation_probs.npy'\n",
    "val_preds_fn = 'validation_preds.csv'\n",
    "\n",
    "s3_bucket = 'raster-vision'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from os.path import join\n",
    "\n",
    "import boto3\n",
    "from botocore.exceptions import ClientError\n",
    "\n",
    "from rastervision.common.settings import results_path\n",
    "from rastervision.common.utils import _makedirs\n",
    "\n",
    "def s3_download(run_name, file_name, new_file_name=None):\n",
    "    if new_file_name is None:\n",
    "        new_file_name = file_name\n",
    "    s3_key = 'results/{}/{}'.format(run_name, file_name)\n",
    "    run_path = join('/opt/data/results/', run_name, new_file_name)\n",
    "    s3 = boto3.resource('s3')\n",
    "    s3.Bucket(s3_bucket).download_file(s3_key, run_path)\n",
    "    \n",
    "def download_run(run_name):\n",
    "    s3_download(run_name, train_probs_fn)\n",
    "    s3_download(run_name, train_preds_fn)\n",
    "    \n",
    "    s3_download(run_name, val_probs_fn)\n",
    "    s3_download(run_name, val_preds_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tagging/7_11_17/dense_ensemble2/0\n",
      "tagging/7_11_17/dense_ensemble2/1\n",
      "tagging/7_11_17/dense_ensemble2/2\n",
      "tagging/7_11_17/dense_ensemble2/3\n",
      "tagging/7_11_17/dense_ensemble2/4\n"
     ]
    }
   ],
   "source": [
    "for run_name in run_names:\n",
    "    print(run_name)\n",
    "    download_run(run_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(32383, 85)\n",
      "(8096, 85)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "train_probs = []\n",
    "val_probs = []\n",
    "for run_name in run_names:\n",
    "    run_path = join('/opt/data/results/', run_name)\n",
    "    train_probs_path = join(run_path, train_probs_fn)\n",
    "    train_probs.append(np.load(train_probs_path))\n",
    "        \n",
    "    val_probs_path = join(run_path, val_probs_fn)\n",
    "    val_probs.append(np.load(val_probs_path))\n",
    "    \n",
    "all_train_probs = np.concatenate(train_probs, axis=1)\n",
    "all_val_probs = np.concatenate(val_probs, axis=1)\n",
    "print(all_train_probs.shape)\n",
    "print(all_val_probs.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from os.path import splitext, basename\n",
    "import glob\n",
    "\n",
    "\n",
    "def generate_file_inds(path):\n",
    "    paths = sorted(\n",
    "        glob.glob(join(path, '*.{}'.format('jpg'))))\n",
    "\n",
    "    file_inds = []\n",
    "    for path in paths:\n",
    "        file_ind = splitext(basename(path))[0]\n",
    "        file_inds.append(file_ind)\n",
    "    return file_inds\n",
    "\n",
    "dev_file_inds = generate_file_inds('/opt/data/datasets/planet_kaggle/train-jpg')\n",
    "nb_train_inds = int(len(dev_file_inds) * 0.8)\n",
    "train_file_inds = dev_file_inds[0:nb_train_inds]\n",
    "val_file_inds = dev_file_inds[nb_train_inds:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from rastervision.tagging.data.planet_kaggle import TagStore\n",
    "from pandas import read_csv\n",
    "\n",
    "gt_csv_path = '/opt/data/datasets/planet_kaggle/train_v2.csv'\n",
    "gt_tag_store = TagStore(gt_csv_path)\n",
    "\n",
    "gt_train_preds = gt_tag_store.get_tag_array(train_file_inds)\n",
    "gt_val_preds = gt_tag_store.get_tag_array(val_file_inds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from numpy import loadtxt\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.multioutput import MultiOutputClassifier\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]"
     ]
    }
   ],
   "source": [
    "ntags = 17\n",
    "models = [None] * ntags\n",
    "\n",
    "for tag_ind in range(ntags):\n",
    "    models[tag_ind] = LogisticRegression(verbose=10)\n",
    "    models[tag_ind].fit(all_train_probs, gt_train_preds[:, tag_ind])\n",
    "    #models[tag_ind] = XGBClassifier()\n",
    "    #models[tag_ind].fit(all_train_probs, gt_train_preds[:, tag_ind], eval_set=[(all_val_probs, gt_val_preds[:, tag_ind])], eval_metric='error', early_stopping_rounds=10, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "val_preds = []\n",
    "for tag_ind in range(ntags):\n",
    "    val_preds.append(models[tag_ind].predict_proba(all_val_probs)[:, 1][:, np.newaxis])\n",
    "val_preds = np.concatenate(val_preds, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "thresholds = [\n",
    "    0.20000,\n",
    "    0.66000,\n",
    "    0.33000,\n",
    "    0.39000,\n",
    "    0.30000,\n",
    "    0.20000,\n",
    "    0.09000,\n",
    "    0.60000,\n",
    "    0.24000,\n",
    "    0.18000,\n",
    "    0.20000,\n",
    "    0.24000,\n",
    "    0.21000,\n",
    "    0.20000,\n",
    "    0.45000,\n",
    "    0.18000,\n",
    "    0.21000\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.93100344671\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import fbeta_score\n",
    "\n",
    "our_val_preds = val_preds > 0.11\n",
    "#our_val_preds = val_preds > 0.15\n",
    "f2 = fbeta_score(\n",
    "    gt_val_preds, our_val_preds, beta=2, average='samples')\n",
    "print(f2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
